#########################################################################
### Unexpected Responses on Amazon Mechanical Turk                    ###
### Follow-Up Survey with Indian and U.S. Respondents                 ###
#########################################################################


library(rIP)
library(profvis)
library(ggmap)
library(plyr)
library(dplyr)
library(stringr)
library(revgeo)


# Read the follow-up survey export, keeping text columns as character.
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned, whereas TRUE and FALSE are reserved words.
mturk <- read.csv("MTurkFollowup.csv", header = TRUE, stringsAsFactors = FALSE)

###IP Address Checking

#Researcher needs to fill in information here
# Placeholders for the IP-lookup services. In this script they are only
# relevant to the commented-out API calls; the live code loads saved results
# from .RData files instead.
iphub_key <- "Put your IP Hub key here"
ipintel_key <- "Put your e-mail here"
proxycheck_key <- "Put your Proxycheck key here"

#IP Hub
#iphub_info<-getIPinfo(mturk, "IPAddress", iphub_key, plots = TRUE)
#save(iphub_info, file="iphub_info.RData")


#IP Intel
# Pre-allocated all-NA results table (302 rows x 3 columns) that the
# commented-out getipintel() loop fills one row at a time.
ipintel_df <- setNames(
  data.frame(matrix(nrow = 302, ncol = 3)),
  c("Ipintel_Rating", "Ipintel_IP", "Ipintel_BadIP")
)

#for(i in 1:302){
#  tryCatch({
#    print(i)
#  temp<-getipintel(mturk$IPAddress[i], flags="f", oflags="b", contact_info=ipintel_key)
#  temp1<-as.data.frame(temp)
#  temp1<-temp1[,c(2,3,8)]
#  ipintel_df[i,1]<-temp1[1,1]
#  ipintel_df[i,2]<-temp1[1,2]
#  ipintel_df[i,3]<-temp1[1,3]
#  pause(6)
#  }, error=function(e){})
#}

#save(ipintel_df, file="ipintel_df.RData")


#Get more detailed location information
# Pre-allocated all-NA results table (302 rows x 11 columns) that the
# commented-out proxycheck() loop fills one row at a time.
proxycheck_df <- setNames(
  data.frame(matrix(nrow = 302, ncol = 11)),
  c(
    "Proxycheck_Status", "Proxycheck_ISP", "Proxycheck_Country",
    "Proxycheck_CountryISO", "Proxycheck_Region", "Proxycheck_Regioncode",
    "Proxycheck_City", "Proxycheck_Lat", "Proxycheck_Long",
    "Proxycheck_Risk", "Proxycheck_IPAddress"
  )
)

#for(i in 1:302){
#  tryCatch({
#    print(i)
#temp<-proxycheck(mturk$IPAddress[i], api_key=proxycheck_key, vpn=T, asn=T, risk=T, seen=T)
#temp1<-as.data.frame(temp)
#temp1<-temp1[,c(1,3,5:11,14)]
#temp1$IP<-mturk$IPAddress[i]
#proxycheck_df[i,1]<-temp1[1,1]
#proxycheck_df[i,2]<-temp1[1,2]
#proxycheck_df[i,3]<-temp1[1,3]
#proxycheck_df[i,4]<-temp1[1,4]
#proxycheck_df[i,5]<-temp1[1,5]
#proxycheck_df[i,6]<-temp1[1,6]
#proxycheck_df[i,7]<-temp1[1,7]
#proxycheck_df[i,8]<-temp1[1,8]
#proxycheck_df[i,9]<-temp1[1,9]
#proxycheck_df[i,10]<-temp1[1,10]
#proxycheck_df[i,11]<-temp1[1,11]
#pause(1)
#  }, error=function(e){})
#}


#proxycheck_df$Proxycheck_IPAddress<-mturk$IPAddress

#save(proxycheck_df, file="proxycheck_df.RData")

#Load and merge IP address detection
#load("iphub_info.RData")
#load("ipintel_df.RData")
#load("proxycheck_df.RData")
# Collapse repeated rows in both lookup tables, keeping the first occurrence
# of each distinct row.
proxycheck_df <- proxycheck_df[!duplicated(proxycheck_df), , drop = FALSE]
ipintel_df <- ipintel_df[!duplicated(ipintel_df), , drop = FALSE]
#ip2<-merge(iphub_info, proxycheck_df, by.x="IPAddress", by.y="Proxycheck_IPAddress",  all.x=T)
#ip3<-merge(ip2, ipintel_df, by.x="IPAddress", by.y="Ipintel_IP", all.x=T)
#ip3<-unique(ip3)
#ip3<-ip3[-125,]

#mturk2<-merge(mturk, ip3, by="IPAddress", all.x=T)
#mturk2$IPAddress<-sample(0:1)
#save(mturk2, file="mturk2.RData")
# Load the saved merged data set (the steps that build and save mturk2 are
# commented out above).
load("mturk2.RData")

#Match on Country
# Translate the IP Hub country codes handled by this script into country
# names. Codes outside this table, and missing codes, become NA.
ip_hub_country_names <- c(
  BR = "Brazil", CA = "Canada", IN = "India", LK = "Sri Lanka",
  LY = "Libya", MX = "Mexico", OM = "Oman", SE = "Sweden",
  SG = "Singapore", TH = "Thailand", US = "United States"
)
# match() on the character codes (not direct indexing) so the lookup is by
# label even if the code column is stored as a factor.
mturk2$IP_Hub_Country <- unname(
  ip_hub_country_names[match(as.character(mturk2$IP_Hub_Country_Code),
                             names(ip_hub_country_names))]
)

# 1 = CountryCoded equals the IP-based country, 0 = differs,
# NA = either value is missing.
mturk2$Country_match <- ifelse(mturk2$CountryCoded == mturk2$IP_Hub_Country, 1, 0)

table(mturk2$CountryCoded, mturk2$IP_Hub_Country)
table(mturk2$Country, mturk2$CountryCoded)

# which() keeps only rows where the condition is TRUE. A bare logical index
# would return an extra NA entry for every row whose condition is NA.
vps_rows <- which(mturk2$IP_Hub_VPS == 1)
mturk2$Country[vps_rows]
mturk2$IP_Hub_Country[vps_rows]

country_mismatch_rows <- which(mturk2$Country_match == 0)
mturk2$Country[country_mismatch_rows]
mturk2$State[country_mismatch_rows]


#Only 3 people with risky IPs (two in US and 1 in India)
table(mturk2$IP_Hub_Country, mturk2$Proxycheck_Risk)

#Only 4 people using VPS and they're in the US plus Sweden
table(mturk2$IP_Hub_Country, mturk2$IP_Hub_VPS)

table(mturk2$IP_Hub_Country, mturk2$Ipintel_BadIP)


#State match using Lat and Lon
# Placeholder; replaced by the saved lookup results loaded below.
State_Geo <- NA

#for(i in 1:302){
#  tryCatch({
#    print(i)
#  State_Geo[i]<-revgeo(longitude=mturk2$Longitude[i], latitude=mturk2$Latitude[i], provider="google", output="frame", 
#         API="Your API Key Here", item="state")$state
#  pause(1)
#  }, error=function(e){})
#}
#State_Geo<-unlist(State_Geo)

#save(State_Geo, file="State_Geo.RData")

load("State_Geo.RData")

# Recode on a working copy, then attach the result to the data set:
# the "State Not Found" marker becomes NA, and one region name is relabelled
# as "Bangkok".
geo_state <- State_Geo
geo_state <- ifelse(geo_state == "State Not Found", NA, geo_state)
geo_state <- ifelse(geo_state == "Krung Thep Maha Nakhon", "Bangkok", geo_state)
mturk2$State_Geo <- geo_state



#Match on State
# Coded state: start from the US coding, then fall back to the India coding
# and finally the "other" coding wherever the value so far is an empty string.
reported_state <- mturk2$StateUSCoded
reported_state <- ifelse(reported_state == "", mturk2$StateIndiaCoded, reported_state)
reported_state <- ifelse(reported_state == "", mturk2$StateOtherCoded, reported_state)
mturk2$StateCoded <- reported_state

# IP-based state: the Proxycheck region, or the reverse-geocoded state when
# the Proxycheck region is missing.
mturk2$State_IPCheck <- ifelse(
  is.na(mturk2$Proxycheck_Region),
  mturk2$State_Geo,
  mturk2$Proxycheck_Region
)

#Fix states that cannot be found and need to be manually looked up
mturk2$State_IPCheck[c(36, 44, 182)] <- c("Singapore", "Georgia", "Tamil Nadu")

# 1 = coded state equals the IP-based state, 0 = differs, NA = either missing.
mturk2$State_match <- ifelse(mturk2$StateCoded == mturk2$State_IPCheck, 1, 0)

# Long-format cross-tab of coded vs IP-based state, restricted to
# combinations that actually occur.
statematch <- as.data.frame(
  table(State_Reported = mturk2$StateCoded, State_IPLocation = mturk2$State_IPCheck),
  stringsAsFactors = FALSE
)
statematch <- statematch[statematch$Freq > 0, ]
statematch$State_Match <- ifelse(statematch$State_Reported == statematch$State_IPLocation, 1, 0)

#Number of unmatched states
sum(statematch$Freq[statematch$State_Match == 0])
table(mturk2$State_match, mturk2$IP_Hub_Country)

#Table of the actual location of people with state errors in rows; listed location in columns
state_mismatch <- mturk2[mturk2$State_match == 0, ]
table(state_mismatch$State_IPCheck, state_mismatch$StateCoded)
table(state_mismatch$CountryCoded, state_mismatch$State_IPCheck)
#One person in Kerala claimed to be in Singapore, one person in Luzern (using VPS) claimed to be in India
#One person in Maharashtra claimed to be in the US, Four people in TN claimed to be in US


#Unexpected Response Coding

#Hours worked
# Flag (1) rows where the MTurk hours exceed the total hours worked.
mturk_hours_exceed_total <- mturk2$HoursWorkedMTurk > mturk2$HoursWorked
mturk2$HoursWorkedUnexpected <- ifelse(mturk_hours_exceed_total, 1, 0)
table(mturk2$HoursWorkedUnexpected, mturk2$IP_Hub_Country)

#Attention check
table(mturk2$AttentionCheckInvalid, mturk2$IP_Hub_Country)

#HITs completed
# Flag (1) rows whose coded HIT count is below 1000.
under_1000_hits <- mturk2$HITsCompletedCoded < 1000
mturk2$HITsCompletedUnexpected <- ifelse(under_1000_hits, 1, 0)
table(mturk2$HITsCompletedUnexpected, mturk2$IP_Hub_Country)

#Typical day description
table(mturk2$IP_Hub_Country, mturk2$TimingUnexpected)

#Typical day not in English
table(mturk2$IP_Hub_Country, mturk2$TimingUSEnglish)

#Elected Offices
# "Expected" is the sum of the US, India and other indicators;
# "unexpected" is its complement.
elected_offices_expected <- mturk2$ElectedOfficesUS + mturk2$ElectedOfficesIndia +
  mturk2$ElectedOfficesOther
mturk2$ElectedOfficesExpected <- elected_offices_expected
mturk2$ElectedOfficesUnexpected <- 1 - elected_offices_expected

table(mturk2$ElectedOfficesUS, mturk2$IP_Hub_Country)

#Largest City
# Use LargestCityClose only where a US largest-city coding is present; 0 otherwise.
has_us_city_coding <- mturk2$LargestCityUSCoded != ""
mturk2$LargestCityCloseUS <- ifelse(has_us_city_coding, mturk2$LargestCityClose, 0)
table(mturk2$LargestCityCloseUS, mturk2$IP_Hub_Country)

#DateUS
table(mturk2$DateUS, mturk2$IP_Hub_Country)

#HeightUS
table(mturk2$HeightUS, mturk2$IP_Hub_Country)

#LargestCitySizeUS
table(mturk2$LargestCitySizeUS, mturk2$IP_Hub_Country)

#Aggregate Expected US Respondent Score
# Ten components, added left to right; the four "unexpected/invalid" flags
# are inverted so that every component counts 1 for the expected answer.
us_score_components <- list(
  1 - mturk2$HoursWorkedUnexpected,
  1 - mturk2$AttentionCheckInvalid,
  1 - mturk2$HITsCompletedUnexpected,
  1 - mturk2$TimingUnexpected,
  mturk2$TimingUSEnglish,
  mturk2$ElectedOfficesUS,
  mturk2$LargestCityCloseUS,
  mturk2$DateUS,
  mturk2$HeightUS,
  mturk2$LargestCitySizeUS
)
mturk2$USScore <- Reduce(`+`, us_score_components)



#Problem respondents claiming to be in the US
# which() keeps only rows where the condition is TRUE. Indexing with the bare
# logical vector would add an all-NA row to the subset for every respondent
# whose Country_match (or Country) is NA.
mturk3 <- mturk2[which(mturk2$Country_match == 0), ]
table(mturk3$USScore, mturk3$Country)

mturk3.1 <- mturk3[which(mturk3$Country == "USA"), ]
table(mturk3.1$USScore)

table(mturk3.1$TimingUSEnglish)



#Examine low scoring US responses
# which() is used for every row filter in this section: a bare logical index
# returns an all-NA row (or NA element) for each NA in the condition.
mturk4.0 <- mturk2[which(mturk2$IP_Hub_Country == "United States"), ]

#Just those scoring 8 or less
mturk4 <- mturk4.0[which(mturk4.0$USScore < 9), ]

table(mturk4$DateUS)
table((1 - mturk4$HoursWorkedUnexpected))
table((1 - mturk4$AttentionCheckInvalid))
table((1 - mturk4$HITsCompletedUnexpected))
table((1 - mturk4$TimingUnexpected))
table(mturk4$TimingUSEnglish)
table(mturk4$ElectedOfficesUS)
table(mturk4$LargestCityCloseUS)
table(mturk4$HeightUS)
table(mturk4$LargestCitySizeUS)

table(mturk4$IP_Hub_VPS)

#Examples
# Each line lists the raw answers of respondents who failed that component.
mturk4$HoursWorked[which(mturk4$HoursWorkedUnexpected == 1)]
# The original filtered on TimingUnexpected == 0, i.e. the respondents who
# were NOT flagged; every other example here selects the failing group, so
# the flagged rows (== 1) are listed instead.
mturk4$TimingText[which(mturk4$TimingUnexpected == 1)]
mturk4$ElectedOffices[which(mturk4$ElectedOfficesUS == 0)]
mturk4$LargestCityUSCoded[which(mturk4$LargestCityCloseUS == 0)]
mturk4$Height[which(mturk4$HeightUS == 0)]
mturk4$LargestCitySize[which(mturk4$LargestCitySizeUS == 0)]





#Examine US respondents scoring a 9
# which() drops rows whose USScore is NA (and any all-NA rows already present
# in mturk4.0); a bare logical index would carry them along as all-NA rows.
mturk4.1 <- mturk4.0[which(mturk4.0$USScore == 9), ]

table(mturk4.1$DateUS)
table((1 - mturk4.1$HoursWorkedUnexpected))
table((1 - mturk4.1$AttentionCheckInvalid))
table((1 - mturk4.1$HITsCompletedUnexpected))
table((1 - mturk4.1$TimingUnexpected))
table(mturk4.1$TimingUSEnglish)
table(mturk4.1$ElectedOfficesUS)
table(mturk4.1$LargestCityCloseUS)
table(mturk4.1$HeightUS)
table(mturk4.1$LargestCitySizeUS)

table(mturk4.1$IP_Hub_VPS)

#Examples
# Raw answers of the respondents who failed each component.
mturk4.1$HoursWorkedMTurk[which(mturk4.1$HoursWorkedUnexpected == 1)]
mturk4.1$TimingText[which(mturk4.1$TimingUSEnglish == 0)]
mturk4.1$ElectedOffices[which(mturk4.1$ElectedOfficesUS == 0)]
mturk4.1$LargestCityUSCoded[which(mturk4.1$LargestCityCloseUS == 0)]
mturk4.1$Height[which(mturk4.1$HeightUS == 0)]
mturk4.1$LargestCitySize[which(mturk4.1$LargestCitySizeUS == 0)]




#People with "Bad IPs"
#Seem to be perfectly normal responses
table(mturk2$Ipintel_BadIP, mturk2$IP_Hub_Country)
table(mturk2$Ipintel_BadIP, mturk2$USScore)

# Same cross-tab, restricted to rows with CountryUS == 1 and CountryIndia == 1.
country_us_rows <- mturk2[which(mturk2$CountryUS == 1), ]
table(country_us_rows$Ipintel_BadIP, country_us_rows$USScore)
country_india_rows <- mturk2[which(mturk2$CountryIndia == 1), ]
table(country_india_rows$Ipintel_BadIP, country_india_rows$USScore)

# which() keeps only rows where the flag is actually 1. With a bare logical
# index, every respondent whose flag is NA would be added to the subset as
# an all-NA row.
mturk5 <- mturk2[which(mturk2$Ipintel_BadIP == 1), ]


#VPS Respondents
mturk6 <- mturk2[which(mturk2$IP_Hub_VPS == 1), ]


#Proportion of US respondents starting the survey at 6AM Eastern Time
# Rows 74 through 302 of the data set; the denominator is the number of
# selected rows (229), so rows with a missing country still count in the total.
mturk7_rows <- 74:302
mturk7 <- mturk2[mturk7_rows, ]
table(mturk7$IP_Hub_Country) / length(mturk7_rows)

